import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
import warnings
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import recall_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SequentialFeatureSelector
warnings.simplefilter(action="ignore", category=FutureWarning)
After our initial exploration and fine-tuning of the business understanding, it is time to construct our final dataset prior to modeling. Here, we want to make sure to handle any data-integrity issues and cleaning, engineer any new features, apply any transformations that we believe should happen (scaling, logarithms, normalization, etc.), and generally prepare the data for modeling with sklearn.
# Read the vehicles CSV into a DataFrame, then print a summary of its
# columns (dtype and non-null count per column).
vehicles = pd.read_csv(filepath_or_buffer='data/vehicles.csv')
vehicles.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 426880 entries, 0 to 426879 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 426880 non-null int64 1 region 426880 non-null object 2 price 426880 non-null int64 3 year 425675 non-null float64 4 manufacturer 409234 non-null object 5 model 421603 non-null object 6 condition 252776 non-null object 7 cylinders 249202 non-null object 8 fuel 423867 non-null object 9 odometer 422480 non-null float64 10 title_status 418638 non-null object 11 transmission 424324 non-null object 12 VIN 265838 non-null object 13 drive 296313 non-null object 14 size 120519 non-null object 15 type 334022 non-null object 16 paint_color 296677 non-null object 17 state 426880 non-null object dtypes: float64(2), int64(2), object(14) memory usage: 58.6+ MB
# Let pandas pick the best dtypes that support pd.NA for every column.
vehicles = vehicles.convert_dtypes()
# Record the number of rows at this point.
original_row_count = len(vehicles)
# CALC: % of null values in each column
vehicles.isna().sum() / len(vehicles) * 100
id 0.000000 region 0.000000 price 0.000000 year 0.282281 manufacturer 4.133714 model 1.236179 condition 40.785232 cylinders 41.622470 fuel 0.705819 odometer 1.030735 title_status 1.930753 transmission 0.598763 VIN 37.725356 drive 30.586347 size 71.767476 type 21.752717 paint_color 30.501078 state 0.000000 dtype: float64
# Remove a few features (columns) that are not relevant to the analysis.
# `columns=` already identifies the axis, so no separate `axis=1` is passed.
vehicles.drop(columns=['id', 'region', 'VIN', 'state'], inplace=True)
# Heatmap of the missing-value mask (True where a cell is null),
# before dropping NaN's.
px.imshow(vehicles.isnull())